library(summarytools)
Warning message:
Unknown or uninitialised column: `infection_detected`.
drive_auth(use_oob = T, cache = T, email = "rogangrant2022@u.northwestern.edu") # have to run in console :(
Warning messages:
1: Unknown or uninitialised column: `infection_detected`.
2: Unknown or uninitialised column: `infection_detected`.
data = read_sheet("https://docs.google.com/spreadsheets/d/1KHgJ-ZXQAfgwp-X1U2xjOiYEa--YrR6cGV3U20TxTXo/edit?usp=sharing",
skip = 1,
trim_ws = T,
.name_repair = "universal")
Reading from "COVID19 BAL stats"
Range "2:5000000"
New names:
* `Length of ICU stay` -> Length.of.ICU.stay
#remove in-progress entries
data = subset(data, !is.na(Sample))
#mark neutrophilic patients
data$neutrophilic = data$percent_neutrophils > 40
#for subsetting later
observations = table(data$ID)
serial_patients = names(observations[observations > 1])
#adjust types
data$ID = as.character(data$ID)
# simple_md = read_excel(path = "~/Box/COVID19_BAL_flow/01_data/02_clinical_metadata/extracted_clinical_data/LMN_extracted_clinical_data_update052020.xlsx",
# sheet = "New list",
# .name_repair = "universal")
# simple_md$COVID.status = factor(simple_md$COVID.status)
# simple_md$outcome = factor(ifelse(grepl("deceased", simple_md$Discharged..d.c..or.deceased),
# yes = "deceased",
# no = "discharged"))
# simple_md$Study.ID = factor(as.character(simple_md$Study.ID))
#
# source("~/Documents/GitHub/COVID19_BAL_flow/rgrant/read_clinical_metadata_covid19.R")
# md = read_clinical_metadata_covid19()
# md = subset(md, grepl("\\d{4}-BAL-\\d{2}", Sample..)) #collected samples follow this format
#
edw_endpoints = read.csv("~/Box/RGrant/SCRIPT/200608 SCRIPT Basic Endpoints.csv",
strip.white = T,
check.names = T,
na.strings = c("", "NA"))
date_cols = colnames(edw_endpoints)[grepl("date", colnames(edw_endpoints), ignore.case = T) |
grepl("\\_dt", colnames(edw_endpoints))]
edw_endpoints = edw_endpoints %>%
mutate_at(.vars = date_cols,
.funs = function(x){
as.Date(x, format = "%m/%d/%y %H:%M", tz = "CST") })
edw_endpoints$pt_study_id = as.character(edw_endpoints$pt_study_id)
#simplify outcome data
edw_endpoints$binned_outcome = factor(vapply(edw_endpoints$discharge_disposition_name,
function(x)
{
if(x == "Expired")
{
return("Deceased")
} else if(grepl("^Home", x))
{
return("Discharged")
} else if(x %in% c("Acute Care Hospital", "Group Home", "Inpatient Hospice",
"Planned Readmission – DC/transferred to acute inpatient rehab",
"Acute Inpatient Rehabilitation", "Long-Term Acute Care Hospital (LTAC)",
"Skilled Nursing Facility or Subacute Rehab Care"))
{
return("Inpatient Facility")
} else if(is.na(x) | x == "unknown")
{
return(as.character(NA))
} else
{
return("Other")
}}, FUN.VALUE = "char"))
edw_endpoints = edw_endpoints %>%
select(-full_name)
# edw_micro = read_excel("~/Box/RGrant/SCRIPT/200526 SCRIPT Microbiology BAL Results.xlsx",
# skip = 7,
# .name_repair = "universal")
# keep_cols = apply(edw_micro, 2, function(x){ !all(is.na(x)) })
# edw_micro = edw_micro[, keep_cols]
# edw_micro$order.datetime = as.Date(edw_micro$order.datetime, origin = "1899-12-30") #excel date format
# edw_micro = subset(edw_micro, !is.na(order.datetime))
# edw_micro$infection_detected = factor(!is.na(edw_micro$organism.name))
# edw_micro$organism.quantity[grepl(">", edw_micro$organism.quantity)] = "10000"
# edw_micro$organism.quantity = as.numeric(edw_micro$organism.quantity)
# #summarize for easy binding, get rid of garbage info
# edw_micro = edw_micro %>%
# group_by(pt.study.id, order.datetime) %>%
# dplyr::summarize(detected_organisms = list(organism.name),
# organism_quantities = list(organism.quantity)) %>%
# rowwise() %>%
# mutate(infection_detected = any(!is.na(detected_organisms)))
#
# #merge all together
# data$sample_id = substring(data$Sample,
# 10,
# 20)
# md$Study.ID = as.character(md$Study.ID)
# #need for joining, fixed after
# edw_micro$order.datetime = as.character(edw_micro$order.datetime)
# md$BAL.Date = as.character(md$BAL.Date)
# data = data %>%
# left_join(., md, by = c("sample_id" = "Sample..", "ID" = "Study.ID")) %>%
# left_join(., simple_md, by = c("ID" = "Study.ID")) %>%
# left_join(., edw_endpoints, by = c("ID" = "pt_study_id")) %>%
# left_join(., edw_micro, by = c("ID" = "pt.study.id", "BAL.Date" = "order.datetime"))
# only_metadata = md %>%
# left_join(., simple_md, by = "Study.ID") %>%
# left_join(., edw_endpoints, by = c("Study.ID" = "pt_study_id")) %>%
# left_join(., edw_micro, by = c("Study.ID" = "pt.study.id", "BAL.Date" = "order.datetime"))
# only_metadata = unique(only_metadata)
# data$BAL.Date = as.Date(data$BAL.Date)
# edw_micro$order.datetime = as.Date(edw_micro$order.datetime)
#add EDW molecular data
edw_molecular = read.csv("~/Box/RGrant/SCRIPT/200608 SCRIPT and COVID BAL Results.csv",
na.strings = c("", "NA"),
strip.white = T,
check.names = T) %>%
select(-Name)
#make numeric values numeric
numeric_cols = c("day_of_intubation", "day_of_hospitalization", "RBC_BODY_FLUID", "WBC__BODY_FLUID", "NEUTROPHILS__BODY_FLUID",
"Absolute_Neutrophils", "TOXIC_GRANULATION", "MACROPHAGE_BF", "MONOCYTE_BF", "LYMPH_BF",
"ABSOLUTE_LYMPHOCYTES", "EOSINOPHILS__BODY_FLUID", "ABSOLUTE_EOSINOPHILS", "PLASMA_CELL_BF",
"OTHER_CELLS__BODY_FLUID", "AMYLASE_BF", "WHITE_BLOOD_CELL_COUNT",
"C_Reactive_Protein", "LDH", "CREATINE_KINASE__TOTAL", "PROCALCITONIN", "FERRITIN", "TROPONIN_I",
"Creatinine", "AST__SGOT_", "D_DIMER", "max_daily_temp")
edw_molecular = edw_molecular %>% mutate_at(.vars = numeric_cols, .funs = function(x){
x = gsub(">", "", x)
x = gsub("<", "", x)
x = gsub(",", "", x)
x = as.numeric(x)
return(x)})
#fix test columns
test_cols = colnames(edw_molecular)[c(which(colnames(edw_molecular) == "ASPERGILLUS_GALACTOMANNAN_ANTIGEN_NMH_LFH_"):which(colnames(edw_molecular) == "RESPIRATORY_SYNCYTIAL_VIRUS_RESPAN22"),
which(colnames(edw_molecular) == "STREPTOCOCCUS_PNEUMONIAE_ANTIGEN_URINE_1"),
which(colnames(edw_molecular) == "LEGIONELLA_ANTIGEN__EIA__URINE_1"))]
edw_molecular = edw_molecular %>% mutate_at(.vars = test_cols,
.funs = function(x){
x = factor(ifelse(is.na(x),
yes = NA,
no = ifelse((grepl("Not Detected", x, ignore.case = T) |
grepl("Negative", x, ignore.case = T)),
yes = "Negative",
no = "Positive")))
return(x) })
#remove one strange duplicate entry
edw_molecular = subset(edw_molecular, !(duplicated(paste(edw_molecular$ir_id, edw_molecular$BAL_order_timestamp))))
#format into long form
edw_molecular = edw_molecular %>%
pivot_longer(cols = contains("organism_"),
names_to = "microbiology_parameter",
values_to = "microbiology_value")
edw_molecular$main_microbiology_parameter = factor(gsub("_*\\d", "", edw_molecular$microbiology_parameter))
#flatten these parameters into lists of values
edw_molecular = edw_molecular %>%
group_by(ir_id, BAL_order_timestamp, main_microbiology_parameter) %>%
mutate(microbiology_value_list = list(microbiology_value)) %>% # list column
ungroup() %>%
rowwise() %>%
mutate_at(.vars = "microbiology_value_list", .funs = function(x){ #remove NA
cur = na.omit(x)
if(length(cur) == 0)
{
return(list(NULL))
} else
{
return(list(cur))
}
}) %>%
select(-c(microbiology_parameter, microbiology_value)) %>%
ungroup()
#pivot back into wide form for merging (list values get duplicated, need to fix)
listcols = as.character(unique(edw_molecular$main_microbiology_parameter))
edw_molecular = edw_molecular %>%
pivot_wider(names_from = main_microbiology_parameter,
values_from = microbiology_value_list) %>%
rowwise() %>%
#have to remove duplicated list vals
mutate_at(.vars = listcols, .funs = function(x){
return(list(x[[1]])) }) %>%
ungroup()
edw_molecular$order_accession_num = as.character(edw_molecular$order_accession_num)
#fix dates
date_cols = colnames(edw_molecular)[grepl("date", colnames(edw_molecular), ignore.case = T)]
edw_molecular = edw_molecular %>%
mutate_at(.vars = date_cols, .funs = function(x){
as.Date(x, format = "%m/%d/%y", tz = "CST")} )
#make organism quantity numeric
edw_molecular$organism_quantity = lapply(edw_molecular$organism_quantity,
function(x){
x = gsub(">", "", x)
x = gsub("<", "", x)
x = gsub(",", "", x)
x = as.numeric(x)
if(length(x) == 0)
{
return(NULL)
}
return(x)})
#import BAL data
edw_BAL = read.csv("~/Box/RGrant/SCRIPT/200608 SCRIPT BAL Related Labs.csv",
na.strings = c("", "NA"),
strip.white = T,
check.names = T) %>%
select(c(ir_id, pt_study_id, redcap_bal_dt)) %>% #these columns aren't helpful
unique()
edw_BAL$pt_study_id = as.character(edw_BAL$pt_study_id)
edw_BAL$redcap_bal_dt = as.Date(edw_BAL$redcap_bal_dt)
colnames(edw_BAL)[colnames(edw_BAL) == "bal_order_date"] = "BAL_order_date"
#import known COVID patients and all patient IDs
covid_cases = read.csv("~/Box/RGrant/SCRIPT/200608 SCRIPT_COVID_list.csv",
na.strings = c("", "NA"),
strip.white = T,
check.names = T,
colClasses = rep("character", 5))
covid_cases$covid_confirmed = T
all_patients = read.csv("~/Box/RGrant/SCRIPT/STU00204868_subjects_06_04_2020.csv",
na.strings = c("", "NA"),
strip.white = T,
check.names = T,
colClasses = rep("character", 10)) %>%
separate_rows(case.number, sep = ", ") #uncollapse ID column
patient_data = full_join(covid_cases,
all_patients,
by = c("clarity_west_mrn" = "nmhc_record_number")) %>%
select(-c(first_name, last_name, address.line.1:email, nmff_record_number, nmh_record_number,
first.name, last.name)) #some are duplicated
colnames(patient_data)[colnames(patient_data) == "case.number"] = "study_id"
#merge metadata
edw_molecular$ir_id = as.character(edw_molecular$ir_id)
edw_endpoints$ir_id = as.character(edw_endpoints$ir_id)
only_metadata = left_join(edw_molecular,
patient_data,
by = c("ir_id")) %>%
select(-MRN) %>%
full_join(.,
edw_endpoints,
by = c("ir_id", "study_id" = "pt_study_id")) %>%
select(-west_mrn)
#fix colnames
colnames(data) = gsub("\\.", "_", colnames(data)) #I like underscores
colnames(data) = gsub("_+$", "", colnames(data)) # remove trailing
colnames(data) = gsub("_+", "_", colnames(data)) #remove dup underscores
colnames(only_metadata) = gsub("\\.", "_", colnames(only_metadata))
colnames(only_metadata) = gsub("_+$", "", colnames(only_metadata))
colnames(only_metadata) = gsub("_+", "_", colnames(only_metadata))
#derive additional metadata
only_metadata$days_from_death = as.numeric(difftime(only_metadata$BAL_order_date, only_metadata$death_date, units = "days"))
only_metadata$covid_confirmed[is.na(only_metadata$covid_confirmed)] = FALSE #may be a safer way to do this
only_metadata$ventilator_days = as.numeric(difftime(only_metadata$BAL_order_date, only_metadata$first_intubation_date, units = "days"))
#merge with flow data
data$sort_date = as.Date(substring(data$Sample, 1, 8),
format = "%Y%m%d")
merge_date = rep(NA, nrow(data))
for(i in 1:nrow(data))
{
#get patient id and bal order date for each entry
cur = data[i, ]
patient = cur$ID
sort_date = cur$sort_date
latest = as.Date(sort_date)
earliest = as.Date(sort_date - 1) #24hr window
#perform matching (should just be one match per)
matches = subset(only_metadata,
study_id == patient & BAL_order_date >= earliest & BAL_order_date <= latest)
if(nrow(matches) == 0)
{
warning(paste("Unmatched sample warning. Patient:", patient, "Sort date:", sort_date))
next
} else if(nrow(matches) > 1)
{
stop(paste("Error: mutliple matches for single sample. Patient:", patient, "Sort date:", sort_date))
} else
{
merge_date[i] = as.character(matches$BAL_order_date)
}
}
data = cbind(data, merge_date)
data$merge_date = as.Date(data$merge_date)
data = left_join(data,
only_metadata,
by = c("ID" = "study_id", "merge_date" = "BAL_order_date"))
#cast into long form
data = unique(data)
mfi_cols = colnames(data)[grepl("MFI", colnames(data), ignore.case = T)]
percentage_cols = colnames(data)[grepl("percent", colnames(data), ignore.case = T)]
data_long = data %>%
pivot_longer(cols = c(mfi_cols, percentage_cols),
names_to = "flow_parameter",
values_to = "value") %>%
arrange(ID, ventilator_days) #for easy viewing later
data_long$flow_parameter = factor(data_long$flow_parameter)
#output for use with bulk
outname = paste0("~/Box/RGrant/SCRIPT/",
Sys.Date(),
"_",
"SCRIPT_clinical_metadata_processed.rds")
saveRDS(object = only_metadata,
file = outname)
outname = paste0("~/Box/RGrant/SCRIPT/",
Sys.Date(),
"_",
"SCRIPT_flow_plus_clinical_metadata_processed.rds")
saveRDS(object = data,
file = outname)
data
dfSummary(summary_data, plain.ascii = FALSE, style = "grid",
graph.magnif = 0.75, valid.col = FALSE, tmp.img.dir = "/tmp")
Dimensions: 869 x 76
Duplicates: 1
| No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Missing |
|---|---|---|---|---|---|
| 1 | ir_id [character] |
1. 2161703 2. 4132754 3. 14931516 4. 1037422 5. 187122 6. 1335514 7. 15023765 8. 1906115 9. 1910810 10. 2211247 [ 507 others ] |
14 ( 1.6%) 14 ( 1.6%) 13 ( 1.5%) 9 ( 1.0%) 9 ( 1.0%) 7 ( 0.8%) 7 ( 0.8%) 7 ( 0.8%) 7 ( 0.8%) 7 ( 0.8%) 775 (89.2%) |
0 (0%) |
|
| 2 | first_intubation_date [Date] |
min : 2020-03-04 med : 2020-04-16 max : 2020-06-04 range : 3m 0d |
87 distinct values | 260 (29.92%) |
|
| 3 | hosp_admission_date [Date] |
min : 2020-03-01 med : 2020-04-13 max : 2020-06-04 range : 3m 3d |
91 distinct values | 254 (29.23%) |
|
| 4 | hosp_disch_date [Date] |
min : 2020-03-21 med : 2020-05-08 max : 2020-06-07 range : 2m 17d |
69 distinct values | 428 (49.25%) |
|
| 5 | order_accession_num [character] |
1. 12009204212 2. 12006403902 3. 12006502587 4. 12006704644 5. 12006801307 6. 12006802667 7. 12006905404 8. 12006910597 9. 12007004294 10. 12007104458 [ 604 others ] |
2 ( 0.3%) 1 ( 0.2%) 1 ( 0.2%) 1 ( 0.2%) 1 ( 0.2%) 1 ( 0.2%) 1 ( 0.2%) 1 ( 0.2%) 1 ( 0.2%) 1 ( 0.2%) 604 (98.2%) |
254 (29.23%) |
|
| 6 | BAL_order_timestamp [character] |
1. 6/2/2020 2:08:00 PM 2. 3/22/2020 1:33:00 PM 3. 4/1/2020 4:41:00 PM 4. 4/21/2020 1:33:00 PM 5. 5/16/2020 1:39:00 AM 6. 3/10/2020 10:32:00 AM 7. 3/11/2020 8:40:00 AM 8. 3/12/2020 11:04:00 AM 9. 3/13/2020 5:25:00 PM 10. 3/14/2020 12:49:00 PM [ 599 others ] |
3 ( 0.5%) 2 ( 0.3%) 2 ( 0.3%) 2 ( 0.3%) 2 ( 0.3%) 1 ( 0.2%) 1 ( 0.2%) 1 ( 0.2%) 1 ( 0.2%) 1 ( 0.2%) 599 (97.4%) |
254 (29.23%) |
|
| 7 | BAL_order_date [Date] |
min : 2020-03-04 med : 2020-04-24 max : 2020-06-05 range : 3m 1d |
93 distinct values | 254 (29.23%) |
|
| 8 | procedure_name [character] |
1. CULTURE: RESPIRATORY W/GR 2. CULTURE: RESPIRATORY W/GR |
530 (86.2%) 85 (13.8%) |
254 (29.23%) |
|
| 9 | day_of_intubation [numeric] |
Mean (sd) : 8.6 (12.3) min < med < max: -8 < 4 < 89 IQR (CV) : 12 (1.4) |
55 distinct values | 260 (29.92%) |
|
| 10 | day_of_hospitalization [numeric] |
Mean (sd) : 10.5 (12.3) min < med < max: 0 < 7 < 90 IQR (CV) : 13 (1.2) |
57 distinct values | 254 (29.23%) |
|
| 11 | RBC_BODY_FLUID [numeric] |
Mean (sd) : 8340.9 (24291.6) min < med < max: 0 < 1687.5 < 331000 IQR (CV) : 5756.5 (2.9) |
409 distinct values | 361 (41.54%) |
|
| 12 | WBC_BODY_FLUID [numeric] |
Mean (sd) : 1611.5 (4932.9) min < med < max: 0 < 278 < 53750 IQR (CV) : 753 (3.1) |
409 distinct values | 326 (37.51%) |
|
| 13 | NEUTROPHILS_BODY_FLUID [numeric] |
Mean (sd) : 58.3 (30) min < med < max: 0 < 65 < 100 IQR (CV) : 53 (0.5) |
101 distinct values | 302 (34.75%) |
|
| 14 | Absolute_Neutrophils [numeric] |
Mean (sd) : 10.6 (7.1) min < med < max: 0 < 9.2 < 52.6 IQR (CV) : 7.9 (0.7) |
196 distinct values | 394 (45.34%) |
|
| 15 | TOXIC_GRANULATION [numeric] |
All NA’s | 869 (100%) |
||
| 16 | MACROPHAGE_BF [numeric] |
Mean (sd) : 18.5 (19.8) min < med < max: 1 < 10 < 98 IQR (CV) : 23 (1.1) |
79 distinct values | 348 (40.05%) |
|
| 17 | MONOCYTE_BF [numeric] |
Mean (sd) : 7.5 (8.9) min < med < max: 0 < 5 < 73 IQR (CV) : 7 (1.2) |
38 distinct values | 377 (43.38%) |
|
| 18 | LYMPH_BF [numeric] |
Mean (sd) : 13.8 (16.6) min < med < max: 1 < 6 < 95 IQR (CV) : 16.5 (1.2) |
66 distinct values | 362 (41.66%) |
|
| 19 | ABSOLUTE_LYMPHOCYTES [numeric] |
Mean (sd) : 1.2 (1) min < med < max: 0 < 1 < 7.8 IQR (CV) : 0.9 (0.9) |
48 distinct values | 391 (44.99%) |
|
| 20 | EOSINOPHILS_BODY_FLUID [numeric] |
Mean (sd) : 3.5 (6.2) min < med < max: 0 < 1 < 40 IQR (CV) : 2 (1.8) |
20 distinct values | 700 (80.55%) |
|
| 21 | ABSOLUTE_EOSINOPHILS [numeric] |
Mean (sd) : 0.1 (0.2) min < med < max: 0 < 0 < 1.8 IQR (CV) : 0.2 (1.7) |
16 distinct values | 390 (44.88%) |
|
| 22 | PLASMA_CELL_BF [numeric] |
Mean (sd) : 6.4 (10) min < med < max: 0 < 2 < 39 IQR (CV) : 5 (1.6) |
13 distinct values | 838 (96.43%) |
|
| 23 | OTHER_CELLS_BODY_FLUID [numeric] |
Mean (sd) : 9.6 (14.3) min < med < max: 0 < 4 < 100 IQR (CV) : 9 (1.5) |
45 distinct values | 579 (66.63%) |
|
| 24 | AMYLASE_BF [numeric] |
Mean (sd) : 928.4 (5284.2) min < med < max: 10 < 43 < 85300 IQR (CV) : 149 (5.7) |
158 distinct values | 553 (63.64%) |
|
| 25 | WHITE_BLOOD_CELL_COUNT [numeric] |
Mean (sd) : 13.3 (8.1) min < med < max: 0.1 < 11.8 < 58.4 IQR (CV) : 8.8 (0.6) |
227 distinct values | 262 (30.15%) |
|
| 26 | C_Reactive_Protein [numeric] |
Mean (sd) : 16.5 (11.5) min < med < max: 0.5 < 15.6 < 52 IQR (CV) : 19 (0.7) |
243 distinct values | 463 (53.28%) |
|
| 27 | LDH [numeric] |
Mean (sd) : 486.6 (677) min < med < max: 142 < 372 < 12000 IQR (CV) : 240 (1.4) |
306 distinct values | 434 (49.94%) |
|
| 28 | CREATINE_KINASE_TOTAL [numeric] |
Mean (sd) : 748.5 (2261.8) min < med < max: 10 < 164.5 < 20000 IQR (CV) : 412.5 (3) |
256 distinct values | 525 (60.41%) |
|
| 29 | PROCALCITONIN [numeric] |
Mean (sd) : 6.5 (31.7) min < med < max: 0 < 0.7 < 482.8 IQR (CV) : 2.6 (4.9) |
366 distinct values | 407 (46.84%) |
|
| 30 | FERRITIN [numeric] |
Mean (sd) : 1390 (2187.3) min < med < max: 4.6 < 748.9 < 18123.6 IQR (CV) : 1129.4 (1.6) |
343 distinct values | 521 (59.95%) |
|
| 31 | TROPONIN_I [numeric] |
Mean (sd) : 0.2 (0.9) min < med < max: 0 < 0 < 12.7 IQR (CV) : 0.1 (4.6) |
60 distinct values | 474 (54.55%) |
|
| 32 | Creatinine [numeric] |
Mean (sd) : 1.6 (1.8) min < med < max: 0.2 < 1.1 < 17.4 IQR (CV) : 1.1 (1.1) |
260 distinct values | 256 (29.46%) |
|
| 33 | AST_SGOT [numeric] |
Mean (sd) : 92.6 (442.8) min < med < max: 9 < 42 < 10000 IQR (CV) : 41.2 (4.8) |
150 distinct values | 297 (34.18%) |
|
| 34 | D_DIMER [numeric] |
Mean (sd) : 3596.9 (6900.3) min < med < max: 150 < 1166 < 59591 IQR (CV) : 2520.8 (1.9) |
382 distinct values | 461 (53.05%) |
|
| 35 | max_daily_temp [numeric] |
Mean (sd) : 100.3 (1.7) min < med < max: 93.1 < 100 < 108.7 IQR (CV) : 2.4 (0) |
79 distinct values | 255 (29.34%) |
|
| 36 | nmh_mrn [character] |
1. 000666356973 2. 000102901847 3. 000102042781 4. 000103716321 5. 000101880584 6. 000102011706 7. 000700785449 8. 000102209496 9. 000103275513 10. 000666253942 [ 37 others ] |
14 (10.6%) 9 ( 6.8%) 7 ( 5.3%) 7 ( 5.3%) 5 ( 3.8%) 5 ( 3.8%) 5 ( 3.8%) 4 ( 3.0%) 4 ( 3.0%) 4 ( 3.0%) 68 (51.5%) |
737 (84.81%) |
|
| 37 | clarity_west_mrn [character] |
1. 009285802 2. 008278762 3. 006891929 4. 009451861 5. 111011466718 6. 006678950 7. 007822933 8. 007934943 9. 007261558 10. 007941432 [ 54 others ] |
14 ( 8.0%) 9 ( 5.1%) 7 ( 4.0%) 7 ( 4.0%) 7 ( 4.0%) 5 ( 2.8%) 5 ( 2.8%) 5 ( 2.8%) 4 ( 2.3%) 4 ( 2.3%) 109 (61.9%) |
693 (79.75%) |
|
| 38 | nmff_mrn [character] |
1. 10219238 2. 08553029 3. 0102042781 4. 10379710 5. 000700785449 6. 08164707 7. 08206840 8. 0353426504 9. 08310911 10. 08370656 [ 37 others ] |
14 (10.6%) 9 ( 6.8%) 7 ( 5.3%) 7 ( 5.3%) 5 ( 3.8%) 5 ( 3.8%) 5 ( 3.8%) 4 ( 3.0%) 4 ( 3.0%) 4 ( 3.0%) 68 (51.5%) |
737 (84.81%) |
|
| 39 | covid_confirmed [logical] |
1. FALSE 2. TRUE |
693 (79.8%) 176 (20.2%) |
0 (0%) |
|
| 40 | study [character] |
1. STU00204868 | 176 (100.0%) | 693 (79.75%) |
|
| 41 | study_id [character] |
1. 1255 2. 1248 3. 1270 4. 1271 5. 1286 6. 1232 7. 1266 8. 1292 9. 1224 10. 1230 [ 308 others ] |
14 ( 3.3%) 9 ( 2.1%) 7 ( 1.6%) 7 ( 1.6%) 7 ( 1.6%) 5 ( 1.2%) 5 ( 1.2%) 5 ( 1.2%) 4 ( 0.9%) 4 ( 0.9%) 363 (84.4%) |
439 (50.52%) |
|
| 42 | birth_date [character] |
1. 1957-12-01 2. 1969-05-01 3. 1958-03-02 4. 1967-09-22 5. 1992-03-02 6. 1933-11-29 7. 1948-01-21 8. 1982-07-30 9. 1944-12-09 10. 1949-11-12 [ 54 others ] |
14 ( 8.0%) 9 ( 5.1%) 7 ( 4.0%) 7 ( 4.0%) 7 ( 4.0%) 5 ( 2.8%) 5 ( 2.8%) 5 ( 2.8%) 4 ( 2.3%) 4 ( 2.3%) 109 (61.9%) |
693 (79.75%) |
|
| 43 | ethnicity [character] |
1. Hispanic or Latino 2. Not Hispanic or Latino 3. Unknown or Not Reported |
72 (40.9%) 89 (50.6%) 15 ( 8.5%) |
693 (79.75%) |
|
| 44 | gender [character] |
1. Female 2. Male |
65 (36.9%) 111 (63.1%) |
693 (79.75%) |
|
| 45 | races [character] |
1. Asian 2. Black/African American 3. Unknown or Not Reported 4. White |
4 ( 2.3%) 33 (18.8%) 53 (30.1%) 86 (48.9%) |
693 (79.75%) |
|
| 46 | diagnosis [character] |
All NA’s | 869 (100%) |
||
| 47 | arms_populations [character] |
All NA’s | 869 (100%) |
||
| 48 | uuid [character] |
1. 1eaaa740-6202-0138-e6ac-0 2. 1b8c0890-5c0b-0138-e69a-0 3. 5d6921c0-7471-0138-aba2-0 4. 7f6b0b60-6ad1-0138-e6d6-0 5. f32d8cc0-6af1-0138-e6d6-0 6. 00015530-78e2-0138-abb7-0 7. 372fa3e0-503c-0138-8272-0 8. de908930-6932-0138-e6c4-0 9. 0bab7940-7ab1-0138-abb7-0 10. 1a7fe5d0-4f5e-0138-8272-0 [ 54 others ] |
14 ( 8.0%) 9 ( 5.1%) 7 ( 4.0%) 7 ( 4.0%) 7 ( 4.0%) 5 ( 2.8%) 5 ( 2.8%) 5 ( 2.8%) 4 ( 2.3%) 4 ( 2.3%) 109 (61.9%) |
693 (79.75%) |
|
| 49 | consent_dt [Date] |
All NA’s | 869 (100%) |
||
| 50 | death_date [Date] |
All NA’s | 869 (100%) |
||
| 51 | admission_datetime [Date] |
All NA’s | 869 (100%) |
||
| 52 | discharge_datetime [Date] |
All NA’s | 869 (100%) |
||
| 53 | hospital_los_days [integer] |
Mean (sd) : 26.3 (18.2) min < med < max: 1 < 23 < 120 IQR (CV) : 22 (0.7) |
64 distinct values | 510 (58.69%) |
|
| 54 | discharge_disposition_name [character] |
1. Expired 2. Acute Inpatient Rehabilit 3. unknown 4. Home with Home Health Car 5. Home or Self Care 6. Long-Term Acute Care Hosp 7. Skilled Nursing Facility 8. Home with Hospice 9. Home with Outpatient Serv 10. Inpatient Hospice [ 5 others ] |
110 (25.6%) 69 (16.0%) 65 (15.1%) 54 (12.6%) 44 (10.2%) 42 ( 9.8%) 28 ( 6.5%) 6 ( 1.4%) 3 ( 0.7%) 3 ( 0.7%) 6 ( 1.4%) |
439 (50.52%) |
|
| 55 | readm_90day_flg [integer] |
1 distinct value | 1 : 63 (100.0%) | 806 (92.75%) |
|
| 56 | index_icu_start [character] |
1. 3/22/2020 12:00:00 AM 2. 4/18/2020 12:00:00 AM 3. 4/5/2020 12:00:00 AM 4. 4/26/2020 12:00:00 AM 5. 4/29/2020 12:00:00 AM 6. 5/18/2020 12:00:00 AM 7. 4/25/2020 12:00:00 AM 8. 3/12/2020 12:00:00 AM 9. 5/14/2020 12:00:00 AM 10. 5/4/2020 12:00:00 AM [ 223 others ] |
18 ( 4.3%) 14 ( 3.3%) 12 ( 2.8%) 10 ( 2.4%) 10 ( 2.4%) 8 ( 1.9%) 7 ( 1.7%) 6 ( 1.4%) 6 ( 1.4%) 6 ( 1.4%) 325 (77.0%) |
447 (51.44%) |
|
| 57 | index_icu_stop [character] |
1. 5/22/2020 12:00:00 AM 2. 3/22/2020 12:00:00 AM 3. 5/18/2020 12:00:00 AM 4. 5/19/2020 12:00:00 AM 5. 5/23/2020 12:00:00 AM 6. 5/28/2020 12:00:00 AM 7. 3/31/2020 12:00:00 AM 8. 4/4/2020 12:00:00 AM 9. 5/21/2020 12:00:00 AM 10. 5/29/2020 12:00:00 AM [ 220 others ] |
39 ( 9.2%) 15 ( 3.6%) 14 ( 3.3%) 10 ( 2.4%) 7 ( 1.7%) 7 ( 1.7%) 6 ( 1.4%) 6 ( 1.4%) 6 ( 1.4%) 6 ( 1.4%) 306 (72.5%) |
447 (51.44%) |
|
| 58 | index_ICU_LOS_Days [integer] |
Mean (sd) : 12.7 (11.7) min < med < max: 1 < 9 < 59 IQR (CV) : 16 (0.9) |
39 distinct values | 447 (51.44%) |
|
| 59 | total_icu_los_days [integer] |
Mean (sd) : 17.7 (14.8) min < med < max: 1 < 13 < 84 IQR (CV) : 20 (0.8) |
47 distinct values | 447 (51.44%) |
|
| 60 | total_num_of_icu_stays [integer] |
Mean (sd) : 1.6 (1.1) min < med < max: 1 < 1 < 6 IQR (CV) : 1 (0.7) |
1 : 286 (67.8%) 2 : 77 (18.2%) 3 : 24 ( 5.7%) 4 : 14 ( 3.3%) 5 : 17 ( 4.0%) 6 : 4 ( 0.9%) |
447 (51.44%) |
|
| 61 | icu_readmission_flg [integer] |
1 distinct value | 1 : 136 (100.0%) | 733 (84.35%) |
|
| 62 | First_intub_start [character] |
1. 3/22/2020 4:00:00 AM 2. 4/6/2020 4:00:00 AM 3. 4/18/2020 12:00:00 PM 4. 4/18/2020 4:00:00 PM 5. 4/26/2020 4:00:00 PM 6. 3/12/2020 12:00:00 PM 7. 4/25/2020 6:30:00 AM 8. 4/29/2020 9:00:00 PM 9. 3/14/2020 11:00:00 AM 10. 3/22/2020 11:39:17 AM [ 308 others ] |
14 ( 3.3%) 9 ( 2.1%) 7 ( 1.6%) 7 ( 1.6%) 7 ( 1.6%) 5 ( 1.2%) 5 ( 1.2%) 5 ( 1.2%) 4 ( 0.9%) 4 ( 0.9%) 363 (84.4%) |
439 (50.52%) |
|
| 63 | First_intub_stop [character] |
1. 6/8/2020 5:06:06 PM 2. 3/22/2020 8:30:00 PM 3. 5/18/2020 9:30:00 AM 4. 5/29/2020 2:10:00 PM 5. 5/13/2020 9:00:00 AM 6. 5/28/2020 12:38:00 PM 7. 6/3/2020 10:23:00 PM 8. 3/29/2020 4:00:00 PM 9. 4/19/2020 4:47:00 PM 10. 4/23/2020 3:00:00 AM [ 294 others ] |
42 ( 9.8%) 14 ( 3.3%) 9 ( 2.1%) 7 ( 1.6%) 5 ( 1.2%) 5 ( 1.2%) 5 ( 1.2%) 4 ( 0.9%) 4 ( 0.9%) 4 ( 0.9%) 331 (77.0%) |
439 (50.52%) |
|
| 64 | Second_intub_start [character] |
1. 3/27/2020 3:00:00 PM 2. 5/20/2020 10:00:00 PM 3. 3/31/2020 2:00:00 AM 4. 5/9/2020 11:54:00 PM 5. 5/16/2020 4:00:00 PM 6. 6/5/2020 1:00:00 PM 7. 1/10/2019 6:00:00 PM 8. 1/10/2020 7:00:00 PM 9. 1/13/2020 8:00:00 AM 10. 1/16/2020 7:00:00 PM [ 49 others ] |
14 (15.7%) 9 (10.1%) 4 ( 4.5%) 4 ( 4.5%) 3 ( 3.4%) 2 ( 2.2%) 1 ( 1.1%) 1 ( 1.1%) 1 ( 1.1%) 1 ( 1.1%) 49 (55.1%) |
780 (89.76%) |
|
| 65 | Second_intub_stop [character] |
1. 3/31/2020 12:00:00 PM 2. 5/21/2020 2:15:00 AM 3. 4/2/2020 10:25:00 PM 4. 5/10/2020 3:22:00 AM 5. 6/2/2020 7:52:00 AM 6. 6/6/2020 9:46:00 AM 7. 1/12/2019 10:00:00 AM 8. 1/13/2020 10:20:00 AM 9. 1/16/2020 10:22:00 PM 10. 1/18/2019 3:56:00 AM [ 49 others ] |
14 (15.7%) 9 (10.1%) 4 ( 4.5%) 4 ( 4.5%) 3 ( 3.4%) 2 ( 2.2%) 1 ( 1.1%) 1 ( 1.1%) 1 ( 1.1%) 1 ( 1.1%) 49 (55.1%) |
780 (89.76%) |
|
| 66 | Third_intub_start [character] |
1. 4/3/2020 7:36:00 PM 2. 4/9/2020 4:00:00 AM 3. 6/7/2020 7:53:00 PM 4. 1/16/2019 10:00:00 AM 5. 10/2/2018 2:30:00 AM 6. 11/15/2019 9:00:00 PM 7. 12/10/2018 12:00:00 AM 8. 2/14/2020 12:00:00 AM 9. 2/15/2019 1:45:00 PM 10. 2/20/2020 12:39:00 AM [ 11 others ] |
14 (36.8%) 4 (10.5%) 2 ( 5.3%) 1 ( 2.6%) 1 ( 2.6%) 1 ( 2.6%) 1 ( 2.6%) 1 ( 2.6%) 1 ( 2.6%) 1 ( 2.6%) 11 (28.9%) |
831 (95.63%) |
|
| 67 | Third_intub_stop [character] |
1. 5/27/2020 8:00:00 AM 2. 4/13/2020 8:45:00 AM 3. 6/7/2020 8:00:00 PM 4. 1/20/2019 1:00:00 PM 5. 10/6/2018 9:04:00 AM 6. 12/16/2018 9:51:00 AM 7. 12/8/2019 11:44:00 AM 8. 2/14/2020 10:27:00 AM 9. 2/15/2020 8:10:00 AM 10. 2/19/2019 11:00:00 AM [ 11 others ] |
14 (36.8%) 4 (10.5%) 2 ( 5.3%) 1 ( 2.6%) 1 ( 2.6%) 1 ( 2.6%) 1 ( 2.6%) 1 ( 2.6%) 1 ( 2.6%) 1 ( 2.6%) 11 (28.9%) |
831 (95.63%) |
|
| 68 | tracheostomy_flg [integer] |
1 distinct value | 1 : 77 (100.0%) | 792 (91.14%) |
|
| 69 | tracheostomy_dt [Date] |
All NA’s | 869 (100%) |
||
| 70 | pt_mech_vent_redcap [integer] |
1 distinct value | 1 : 425 (100.0%) | 444 (51.09%) |
|
| 71 | days_intube_prior_enroll_redcap [integer] |
Mean (sd) : 10 (70.5) min < med < max: 0 < 1 < 999 IQR (CV) : 5 (7) |
28 distinct values | 446 (51.32%) |
|
| 72 | Has_the_patient_had_prior_non_invasive_ventilation_redcap [integer] |
Min : 0 Mean : 0.2 Max : 1 |
0 : 347 (82.0%) 1 : 76 (18.0%) |
446 (51.32%) |
|
| 73 | duration_of_niv_most_recent_redcap [numeric] |
Mean (sd) : 1.3 (2.3) min < med < max: 0 < 1 < 18 IQR (CV) : 0.8 (1.7) |
22 distinct values | 793 (91.25%) |
|
| 74 | Has_there_been_more_than_one_episode_of_NIV_redcap [integer] |
Min : 0 Mean : 0.2 Max : 1 |
0 : 59 (77.6%) 1 : 17 (22.4%) |
793 (91.25%) |
|
| 75 | pt_trache_redcap [integer] |
Min : 0 Mean : 0.1 Max : 1 |
0 : 386 (90.8%) 1 : 39 ( 9.2%) |
444 (51.09%) |
|
| 76 | binned_outcome [factor] |
1. Deceased 2. Discharged 3. Inpatient Facility 4. Other |
110 (30.1%) 108 (29.6%) 145 (39.7%) 2 ( 0.5%) |
504 (58%) |
percentage_data = subset(data_long, grepl("percent", data_long$flow_parameter))
shapiro.test(percentage_data$value)
Shapiro-Wilk normality test
data: percentage_data$value
W = 0.77072, p-value < 2.2e-16
hist(percentage_data$value, breaks = 50)
mfi_data = subset(data_long, grepl("MFI", data_long$flow_parameter))
shapiro.test(mfi_data$value)
Shapiro-Wilk normality test
data: mfi_data$value
W = 0.67565, p-value < 2.2e-16
hist(mfi_data$value, breaks = 50)
Extremely non-normal in both cases. At the very least, we need nonparametric tests. In reality, we probably need non-parametric tests or beta regression. However, this may not be true of individual parameters. First let’s see if transformations help.
serial_data = subset(percentage_data, ID %in% serial_patients &
!(flow_parameter %in% c("percent_CD4_total", "percent_CD206_total", "percent_total_CD206_low", "percent_total_CD206_high")))
ggplot(serial_data, aes(x = ventilator_days, y = value, fill = flow_parameter)) +
geom_bar(position = "stack", stat = "identity") +
facet_grid(ID ~ .) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# ggplot(serial_data, aes(x = days_from_death, y = value, fill = flow_parameter)) +
# geom_bar(position = "stack", stat = "identity") +
# facet_grid(ID ~ .) +
# theme(axis.text.x = element_text(angle = 45, hjust = 1))
# ggplot(serial_data, aes(x = days_from_extubation, y = value, fill = flow_parameter)) +
# geom_bar(position = "stack", stat = "identity") +
# facet_grid(ID ~ .) +
# theme(axis.text.x = element_text(angle = 45, hjust = 1))
As expected, there appear to be neutrophilic and non-neutrophilic trajectories captured here.
day0_percentages = subset(percentage_data, ventilator_days == 0 &
!(flow_parameter %in% c("percent_CD4_total", "percent_CD206_total", "percent_total_CD206_low", "percent_total_CD206_high")))
ggplot(day0_percentages, aes(x = ID, y = value, fill = flow_parameter)) +
geom_bar(position = "stack", stat = "identity") +
facet_grid(. ~ covid_confirmed, scales = "free_x") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
hist(asin(sqrt(percentage_data$value)), breaks = 50)
NaNs produced
shapiro.test(asin(sqrt(percentage_data$value)))
NaNs produced
Shapiro-Wilk normality test
data: asin(sqrt(percentage_data$value))
W = 0.96828, p-value = 0.0009785
Getting there!
hist(car::logit(percentage_data$value), breaks = 50)
shapiro.test(car::logit(percentage_data$value))
Shapiro-Wilk normality test
data: car::logit(percentage_data$value)
W = 0.99734, p-value = 0.1015
This is getting very close. Let’s see if individual observations are normal.
percentage_data$logit = car::logit(percentage_data$value)
percentage_data %>%
group_by(flow_parameter) %>%
dplyr::summarize(pval = shapiro.test(logit)$p.value)
`summarise()` ungrouping output (override with `.groups` argument)
ggplot(percentage_data, aes(x = logit)) +
geom_histogram() +
facet_wrap(~ flow_parameter)
We can probably work with this, actually. Ideally we should talk to a statistician at some point before publication. Use normalized values for all stats.
hist(log10(mfi_data$value), breaks = 50)
NaNs produced
shapiro.test(log10(mfi_data$value))
NaNs produced
Shapiro-Wilk normality test
data: log10(mfi_data$value)
W = 0.96945, p-value = 1.571e-10
Trimodal? But within individual parameters this should be okay. .
mfi_data$log10 = log10(mfi_data$value)
NaNs produced
mfi_data %>%
group_by(flow_parameter) %>%
dplyr::summarize(pval = shapiro.test(log10)$p.value)
`summarise()` ungrouping output (override with `.groups` argument)
ggplot(mfi_data, aes(x = log10)) +
geom_histogram() +
facet_wrap(~ flow_parameter)
Looks pretty good.
data_wide_percentage = percentage_data %>%
pivot_wider(names_from = flow_parameter,
values_from = c(logit, value))
data_wide_percentage$covid_confirmed[is.na(data_wide_percentage$covid_confirmed)] = FALSE
data_wide_percentage$covid_confirmed = factor(data_wide_percentage$covid_confirmed)
data_wide_percentage$response = data_wide_percentage %>%
select(logit_percent_CD4_total:logit_percent_others)
mancova_data = data_wide_percentage %>%
select(c(logit_percent_CD4_total:logit_percent_others,
covid_confirmed, ventilator_days)) %>%
na.omit()
mancova_fit = manova(cbind(logit_percent_CD4_total, logit_percent_CD4_Tregs, logit_percent_CD4_non_Tregs,
logit_percent_CD8_total, logit_percent_CD206_total, logit_percent_macs_CD206_high,
logit_percent_total_CD206_high, logit_percent_macs_CD206_low, logit_percent_total_CD206_low,
logit_percent_neutrophils, logit_percent_monocytes, logit_percent_others)
~ covid_confirmed * Outcomes,
na.action = "na.omit",
data = mancova_data) #add outcomes??
summary.aov(mancova_fit)
These are interesting results! With the exception of monocytes, abundance of all cells is affected by COVID-19 in some way. For CD4T, and CD8T, this appears to be purely a main effect. For Tregs and putative MoAM, there is also an interaction effect of day and infection. For TRAM, there is only an interaction effect.
Area plot
area_data = percentage_data %>%
group_by(covid_confirmed, Day, flow_parameter) %>%
dplyr::summarize(mean_pct = mean(value))
`summarise()` regrouping output by 'covid_confirmed', 'Day' (override with `.groups` argument)
area_data = subset(area_data, !(flow_parameter %in% c("percent_CD4_total", "percent_CD206_total", "percent_total_CD206_low", "percent_total_CD206_high")))
ggplot(area_data, aes(x = Day, y = mean_pct, fill = flow_parameter)) +
geom_area(alpha = 0.7) +
facet_grid(covid_confirmed ~ .)
This give a false impression between samples. A line plot may actually be better.
Grouped
ggplot(percentage_data, aes(x = ventilator_days, y = value, color = flow_parameter)) +
stat_summary(fun = mean, geom = "line") +
stat_summary(fun.data = mean_se, geom = "errorbar", width = 0.5, alpha = 0.5) +
facet_grid(Outcomes ~ covid_confirmed)
By patient
ggplot(percentage_data, aes(x = ventilator_days, y = value, color = flow_parameter)) +
geom_line() +
facet_wrap(~ ID)
Not enough data at this point to glean much of anything.
#cast into matrix of patient x flow parameter
all_day0 = subset(data, ventilator_days == 0)
matrix_data = reshape2::dcast(data = subset(data_long, ventilator_days == 0),
formula = ID ~ flow_parameter) %>%
column_to_rownames(var = "ID") #easier than removing columns
annotation_data = all_day0 %>%
select(c(ID, Outcomes, covid_confirmed)) %>%
column_to_rownames(var = "ID")
pheatmap(mat = matrix_data,
cluster_rows = T,
cluster_cols = T,
clustering_method = "ward.D2",
annotation_row = annotation_data,
scale = "column",
angle_col = 45,
breaks = seq(-3, 3, length.out=101),
color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(101))
matrix_data = reshape2::dcast(data = data_long,
formula = sample_id ~ flow_parameter) %>%
column_to_rownames(var = "sample_id") #easier than removing columns
annotation_data = data %>%
select(c(sample_id, outcome, COVID_status, ID, ventilator_days)) %>%
column_to_rownames(var = "sample_id")
pheatmap(mat = matrix_data,
cluster_rows = T,
cluster_cols = T,
clustering_method = "ward.D2",
annotation_row = annotation_data,
scale = "column",
angle_col = 45,
show_rownames = F,
breaks = seq(-3, 3, length.out=101),
color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(101))
pheatmap(mat = matrix_data,
cluster_rows = F,
cluster_cols = T,
clustering_method = "ward.D2",
annotation_row = annotation_data,
scale = "column",
angle_col = 45,
show_rownames = F,
breaks = seq(-3, 3, length.out=101),
color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(101))
only_metadata = only_metadata %>%
rowwise() %>%
mutate(infection_detected = !is.null(organism_name)) %>%
ungroup()
only_metadata$infection_detected[is.na(only_metadata$infection_detected)] = "Not Tested"
only_metadata$infection_detected = factor(only_metadata$infection_detected,
levels = c("Not Tested", "TRUE", "FALSE"))
ggplot(subset(only_metadata, !is.na(covid_confirmed)), aes(x = covid_confirmed, fill = infection_detected)) +
geom_bar(position = "fill") +
ylab("Proportion") +
scale_fill_manual(values = c("Not Tested" = "gray", "FALSE" = "cornflowerblue", "TRUE" = "firebrick"))
covid_organism_levels = only_metadata %>%
dplyr::filter(covid_confirmed == T) %>%
select(organism_quantity) %>%
unlist()
other_organism_levels = only_metadata %>%
dplyr::filter(covid_confirmed == F) %>%
select(organism_quantity) %>%
unlist()
ggplot(NULL) +
geom_density(aes(x = covid_organism_levels), fill = "firebrick", alpha = 0.5) +
geom_density(aes(x = other_organism_levels), fill = "cornflowerblue", alpha = 0.5)
covid_organisms = as.character(only_metadata %>%
dplyr::filter(covid_confirmed == T) %>%
select(organism_name) %>%
unlist() %>%
na.omit())
covid_organisms = gsub("[[:punct:]] | | [[:punct:]]|[[:punct:]]", "_", trimws(covid_organisms)) #so we can treat as one word
covid_organisms = gsub("_$", "", covid_organisms)
other_organisms = as.character(only_metadata %>%
dplyr::filter(covid_confirmed == F) %>%
select(organism_name) %>%
unlist() %>%
na.omit())
other_organisms = gsub("[[:punct:]] | | [[:punct:]]|[[:punct:]]", "_", trimws(other_organisms))
other_organisms = gsub("_$", "", other_organisms)
#make into data frame with normalized freqs
df1 = data.frame(word = names(termFreq(covid_organisms)),
freq = as.numeric(termFreq(covid_organisms)) /
sum(as.numeric(termFreq(covid_organisms))),
covid_confirmed = T)
df2 = data.frame(word = names(termFreq(other_organisms)),
freq = as.numeric(termFreq(other_organisms)) /
sum(as.numeric(termFreq(other_organisms))),
covid_confirmed = F)
#finally, plot
wordcould_df = rbind(df1, df2)
ggplot(wordcould_df, aes(label = word, size = freq)) +
geom_text_wordcloud(show.legend = F) +
facet_wrap(~ covid_confirmed)
clinical_data = only_metadata %>%
unique() %>% #oddly there is duplication
dplyr::filter(!is.na(BAL_order_date)) %>% #remove samples without BAL info
mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>%
select(c(sample_id,
RBC_BODY_FLUID:Absolute_Neutrophils,
MACROPHAGE_BF:OTHER_CELLS_BODY_FLUID,
AMYLASE_BF,
WHITE_BLOOD_CELL_COUNT:D_DIMER,
max_daily_temp)) %>%
column_to_rownames(var = "sample_id")
safe_cols = function(x)
{
return(all(is.finite(na.omit(x))) &
sd(x, na.rm = T) > 0 &
sum(!is.na(x)) > (length(x) / 2))
}
keep_cols = apply(clinical_data, 2, safe_cols)
pca_data = na.omit(clinical_data[, keep_cols])
#need to use formula interface to deal with NAs because nothing is ever easy
formula = paste0("~ ",
paste(colnames(pca_data), collapse = "+"))
formula = as.formula(formula)
pca = prcomp(formula = formula,
data = pca_data,
scale. = T,
na.action = na.omit)
pca_data_complete = only_metadata %>%
unique() %>%
mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>%
dplyr::filter(sample_id %in% rownames(pca_data)) %>%
column_to_rownames(var = "sample_id")
fviz_eig(pca)
autoplot(pca,
There were 34 warnings (use warnings() to see them)
data = pca_data_complete,
colour = "binned_outcome",
shape = "covid_confirmed",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 1,
y = 2)
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
shape = "covid_confirmed",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 2,
y = 3)
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
shape = "covid_confirmed",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 3,
y = 4)
Some interesting loadings: PC1-2 is driven heavily by markers of sepsis and/or organ failure. PC3 appears to just broadly be immune response, but I find it interesting that CRP and monocytes (in this case from BAL) travel together. Are they the source of IL6? Is this a response to IL6? PC4 is similar, with influence of CRP and ferritin suggesting high levels of inflammation. Let’s also try UMAP to see if there is some hidden structure.
clinical_data_ltd = only_metadata %>%
unique() %>% #oddly there is duplication
dplyr::filter(!is.na(BAL_order_date)) %>% #remove samples without BAL info
mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>%
select(c(sample_id,
Absolute_Neutrophils,
WHITE_BLOOD_CELL_COUNT:D_DIMER,
max_daily_temp)) %>%
column_to_rownames(var = "sample_id")
keep_cols = apply(clinical_data_ltd, 2, safe_cols)
pca_data = na.omit(clinical_data_ltd[, keep_cols])
#need to use formula interface to deal with NAs because nothing is ever easy
formula = paste0("~ ",
paste(colnames(pca_data), collapse = "+"))
formula = as.formula(formula)
pca = prcomp(formula = formula,
data = pca_data,
scale. = T,
na.action = na.omit)
pca_data_complete = only_metadata %>%
unique() %>%
mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>%
dplyr::filter(sample_id %in% rownames(pca_data)) %>%
column_to_rownames(var = "sample_id")
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 1,
y = 2)
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 2,
y = 3)
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 3,
y = 4)
umap = umap(pca_data,
n_neighbors = 15,
min_dist = 0.1,
scale = "Z")
colnames(umap) = c("umap_1", "umap_2")
umap_data = cbind(pca_data_complete, umap)
This doesn’t seem to add much.
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 1,
y = 2)
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 2,
y = 3)
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 3,
y = 4)
Not actually very different from complete dataset.
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 1,
y = 2)
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 2,
y = 3)
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 3,
y = 4)
Data is still too sparse to really look into this.
heatmap_data = only_metadata %>%
Warning message:
Unknown or uninitialised column: `infection_detected`.
unique() %>% #oddly there is duplication
dplyr::filter(!is.na(BAL_order_date)) %>% #remove samples without BAL info
mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>%
select(c(sample_id,
Absolute_Neutrophils,
WHITE_BLOOD_CELL_COUNT:D_DIMER,
max_daily_temp)) %>%
column_to_rownames(var = "sample_id")
heatmap_metadata = only_metadata %>%
unique() %>%
mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>%
dplyr::filter(sample_id %in% rownames(heatmap_data)) %>%
select(sample_id, ventilator_days, binned_outcome) %>%
column_to_rownames(var = "sample_id")
pheatmap(heatmap_data,
cluster_rows = T,
cluster_cols = T,
clustering_method = "ward.D2",
annotation_row = heatmap_metadata,
scale = "column",
angle_col = 45,
breaks = seq(-5, 5, length.out=101),
color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(101))